#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#' theme: united
#' ---

rm(list=ls(all=t))

#' #Setup filenames

filename <- "Survey Data - Ecuador_Public Use" # !!!Update filename
functions_vers <-  "functions_1.7.R" # !!!Update helper functions file

#' #Setup data, functions and create dictionary for dataset review
source (functions_vers)
#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables

#' #Direct PII: variables to be removed
# !!!No direct PII
#'
#' #Direct PII-team: Encode field team names
# !!!No direct PII-team
#'
#' #Small locations: Encode locations  with pop <100,000 using random large numbers
#  !!!No small locations
#' 
#' #Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Recode education attainment of adults to reduce risk of re-identification 
# !!!No education attainment of adults.

# Top code household composition variables with large and unusual numbers 

mydata <- top_recode ("b_v6", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals. 
mydata <- top_recode ("b_hhsize_2016_masked", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
mydata <- top_recode ("b_hhsize_2016_imp", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.

# Top code high income to the 99.5 percentile
# No income variables to top code

#' #Indirect PII - Categrical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

indirect_PII <- c("age_at_base",
                  "female",
                  "b_v2",
                  "b_v3_masked",
                  "b_v4",
                  "b_v5",
                  "b_v7",
                  "b_v8",
                  "b_v9",
                  "b_v10",
                  "b_v11",
                  "b_v12",
                  "b_v13",
                  "b_v20_1",
                  "b_v20_2",
                  "b_v20_3",
                  "b_v20_4",
                  "b_v20_5",
                  "b_v20_6",
                  "b_v20_7",
                  "b_v20_8",
                  "b_v20x1_masked",
                  "b_v21_1_masked",
                  "b_v21_2_masked",
                  "b_v21_3_masked",
                  "b_v21_4_masked",
                  "b_v21_5_masked",
                  "b_v21_6_masked",
                  "b_v21_7_masked",
                  "b_v22_1",
                  "b_v22_2",
                  "b_v22_3",
                  "b_v22_4",
                  "b_v23",
                  "b_v24",
                  "b_v25_1",
                  "b_v25_2",
                  "b_v25_3",
                  "b_v25_4",
                  "b_v25_5",
                  "b_v25_6",
                  "b_v25_7",
                  "b_v25_8",
                  "b_v25_9",
                  "b_v25_10",
                  "b_v26_1_masked",
                  "b_v26_2_masked",
                  "b_v26_3_masked",
                  "b_v26_4_masked",
                  "b_v26_5_masked",
                  "b_v26_6_masked",
                  "b_v26_7_masked",
                  "b_v27_1",
                  "b_v27_2",
                  "b_v27_3",
                  "b_v27_4",
                  "b_v28_masked",
                  "b_v29_1",
                  "b_v29_2",
                  "b_v29_3",
                  "b_v29_4",
                  "b_v29_5",
                  "b_v29_6",
                  "b_v29_7",
                  "b_v29_8",
                  "b_v29_9",
                  "b_v29_10",
                  "b_v29_11",
                  "b_v29_13",
                  "b_v29_18",
                  "b_v29_19",
                  "b_v29_masked",
                  "b_v29_other",
                  "b_v30_1",
                  "b_v30_2",
                  "b_v30_3",
                  "b_v30_4",
                  "b_v30_5",
                  "b_v30_6",
                  "b_v30_7",
                  "b_v30_8",
                  "b_v30_9",
                  "b_v30_10",
                  "b_v30_11",
                  "b_v30_other",
                  "b_v31",
                  "b_v32",
                  "b_v32_other",
                  "b_v33",
                  "b_v34",
                  "b_v35_1",
                  "b_v35_2",
                  "b_v35_3",
                  "b_v35_4",
                  "b_v35_5",
                  "b_v35_6",
                  "b_v35_7",
                  "b_v35_8",
                  "b_v35_9",
                  "b_v35_10",
                  "b_v35_11",
                  "b_v35_12",
                  "b_v35_13",
                  "b_v35_14",
                  "b_v35x0",
                  "b_v36_1",
                  "b_v36_2",
                  "b_v36_3",
                  "b_v36_4",
                  "b_v36_5",
                  "b_v36_6",
                  "b_v36_7",
                  "b_v36_other",
                  "b_v37_1",
                  "b_v37_2",
                  "b_v37_3",
                  "b_v37_4",
                  "b_v37_5",
                  "b_v37_6",
                  "b_v37_7",
                  "b_v37_8",
                  "b_v37_9",
                  "b_v37_10",
                  "b_v37_11",
                  "b_v37_12",
                  "b_v37_13",
                  "b_v37_14",
                  "b_v37_15",
                  "b_v37_other",
                  "b_v39",
                  "b_v40",
                  "b_v46_masked",
                  "b_v47_masked",
                  "e_v7_a",
                  "e_v7_b",
                  "e_v7_c",
                  "e_v7_d",
                  "e_v7_e",
                  "e_v7_f",
                  "e_v7_g",
                  "e_v7_h",
                  "e_v7_other_masked",
                  "e_v8_1_masked",
                  "e_v8_2_masked",
                  "e_v8_3_masked",
                  "e_v8_4_masked",
                  "e_v8_5_masked",
                  "e_v8_6_masked",
                  "e_v8_7_masked",
                  "e_v9_a",
                  "e_v9_b",
                  "e_v9_c",
                  "e_v9_d",
                  "e_v10",
                  "e_v11_a",
                  "e_v11_b",
                  "e_v11_c",
                  "e_v11_d",
                  "e_v11_e",
                  "e_v11_f",
                  "e_v11_g",
                  "e_v11_h",
                  "e_v11_i",
                  "e_v11_j",
                  "e_v12_1_masked",
                  "e_v12_2_masked",
                  "e_v12_3_masked",
                  "e_v12_4_masked",
                  "e_v12_5_masked",
                  "e_v12_6_masked",
                  "e_v12_7_masked",
                  "e_v13_a",
                  "e_v13_b",
                  "e_v13_c",
                  "e_v13_d",
                  "e_v14_masked",
                  "e_v15_2",
                  "e_v15_3",
                  "e_v15_4",
                  "e_v15_5",
                  "e_v15_6",
                  "e_v15_7",
                  "e_v15_8",
                  "e_v15_9",
                  "e_v15_10",
                  "e_v15_11",
                  "e_v15_12",
                  "e_v15_13",
                  "e_v15_14",
                  "e_v15_15",
                  "e_v15_16",
                  "e_v15_masked",
                  "e_v15_other",
                  "e_v15_other_hazardous",
                  "e_v16_1",
                  "e_v16_2",
                  "e_v16_3",
                  "e_v16_4",
                  "e_v16_5",
                  "e_v16_6",
                  "e_v16_7",
                  "e_v16_8",
                  "e_v16_9",
                  "e_v16_10",
                  "e_v16_11",
                  "e_v16_other",
                  "e_v17",
                  "e_v18",
                  "e_v19",
                  "e_v20",
                  "e_v21_6meses",
                  "e_v21_a",
                  "e_v21_b",
                  "e_v21_c",
                  "e_v21_d",
                  "e_v21_e",
                  "e_v21_f",
                  "e_v21_g",
                  "e_v21_h",
                  "e_v21_i",
                  "e_v21_j",
                  "e_v21_k",
                  "e_v21_l",
                  "e_v21_m",
                  "e_v21_n",
                  "e_v22_a",
                  "e_v22_b",
                  "e_v22_c",
                  "e_v22_d",
                  "e_v22_e",
                  "e_v22_f",
                  "e_v22_g",
                  "e_v22_other",
                  "e_v23_a",
                  "e_v23_b",
                  "e_v23_c",
                  "e_v23_d",
                  "e_v23_e",
                  "e_v23_f",
                  "e_v23_g",
                  "e_v23_h",
                  "e_v23_i",
                  "e_v23_j",
                  "e_v23_k",
                  "e_v23_l",
                  "e_v23_m",
                  "e_v23_o_masked",
                  "e_v23_other",
                  "e_v25",
                  "e_v26",
                  "e_v31_masked",
                  "e_v32_masked",
                  "b_afro_2016",
                  "b_any_dustfumes_2016",
                  "b_any_harrassm_2016",
                  "b_any_injury_2016",
                  "b_casual2_2016",
                  "b_casual_2016",
                  "b_chorehrs_2016_masked",
                  "b_vulnerable2_2016",
                  "b_vulnerable_2016",
                  "b_fnat_2016",
                  "b_fspan_2016",
                  "b_gangcurr_2016",
                  "b_gangpast_2016",
                  "b_hcl_2016",
                  "b_hhchores_2016",
                  "b_hz_industry_2016",
                  "b_hz_occupation_2016",
                  "b_indig_2016",
                  "b_irregular_empl2_2016",
                  "b_irregular_empl_2016",
                  "b_long_hrs_2016",
                  "b_married_2016",
                  "b_mestizo_2016",
                  "b_mnat_2016",
                  "b_montub_2016",
                  "b_mspan_2016",
                  "b_night_work_2016",
                  "b_nightchores_2016",
                  "b_older_2016",
                  "b_othrace_2016",
                  "b_ownchild_2016",
                  "b_sameage_2016",
                  "b_temporary_noncasual2_2016",
                  "b_temporary_noncasual_2016",
                  "b_total_hrs_2016_masked",
                  "b_white_2016",
                  "b_working_2016",
                  "b_yearsout_2016",
                  "b_younger_2016",
                  "b_afro_2016_imp",
                  "b_any_dustfumes_2016_imp",
                  "b_any_harrassm_2016_imp",
                  "b_any_injury_2016_imp",
                  "b_chorehrs_2016_imp",
                  "b_fspan_2016_imp",
                  "b_gangcurr_2016_imp",
                  "b_hcl_2016_imp",
                  "b_hz_industry_2016_imp",
                  "b_hz_occupation_2016_imp",
                  "b_irregular_empl2_2016_imp",
                  "b_irregular_empl_2016_imp",
                  "b_long_hrs_2016_imp",
                  "b_married_2016_imp",
                  "b_mestizo_2016_imp",
                  "b_mspan_2016_imp",
                  "b_night_work_2016_imp",
                  "b_nightchores_2016_imp",
                  "b_older_2016_imp",
                  "b_othrace_2016_imp",
                  "b_ownchild_2016_imp",
                  "b_sameage_2016_imp",
                  "b_total_hrs_2016_imp",
                  "b_white_2016_imp",
                  "b_working_2016_imp",
                  "b_yearsout_2016_imp",
                  "b_younger_2016_imp",
                  "e_any_dustfumes_2017",
                  "e_any_harrassm_2017",
                  "e_any_injury_2017",
                  "e_casual_2017",
                  "e_chorehrs_2017_masked",
                  "e_gangcurr_2017",
                  "e_gangpast_2017",
                  "e_hcl_2017",
                  "e_hhchores_2017",
                  "e_hz_industry_2017",
                  "e_hz_occupation_2017",
                  "e_irregular_empl2_2017",
                  "e_irregular_empl_2017",
                  "e_long_hrs_2017",
                  "e_night_work_2017",
                  "e_nightchores_2017",
                  "e_shortcontract_2017",
                  "e_vulnerable_2017",
                  "e_temporary_noncasual_2017",
                  "e_total_hrs_2017_masked",
                  "e_working_2017")

capture_tables (indirect_PII)

# Recode those with very specific values. 

mydata <- mydata[!names(mydata) %in% c("b_v31", "e_v17")] # Too detailed, open-ended verbatim in Spanish

# Recode races or ethnicity to reduce risk of re-identification

break_edu <- c(1,2,3,88,99)
labels_edu <- c("1. White" =1,
                "2. Mestizo" = 2,
                "3. Other" =3)
mydata <- ordinal_recode (variable="b_v9", break_points=break_edu, missing=8888, value_labels=labels_edu)

#Drop variables that shows the same information as b_v9.

dropvars <- c("b_afro_2016","b_indig_2016","b_montub_2016","b_othrace_2016", "b_afro_2016_imp","b_othrace_2016_imp") 
mydata <- mydata[!names(mydata) %in% dropvars]

#Recode mother and father's native language to reduce risk of re-identification

break_edu <- c(1,2,4,5)
labels_edu <- c("1. Spanish" =1,
                "2. Other" = 2,
                "3. Not sure"=3,
                "4. No response"=4)
mydata <- ordinal_recode (variable="b_v7", break_points=break_edu, missing=8888, value_labels=labels_edu)
mydata <- ordinal_recode (variable="b_v8", break_points=break_edu, missing=8888, value_labels=labels_edu)

#' #Matching and crosstabulations: Run automated PII check 

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('age_at_base', 'female', 'b_v9') ##!!! Replace with candidate categorical demo vars

# weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' #Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
#' !!! No Open-ends
#'
#' #GPS data: Displace
# !!!No GPS data
#'
#' #Save processed data in Stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name 

haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)
#'---
#'  title: `r title_var`
#'---
